# What’s covered in this lecture?
# R: rvest for web data retrieval, e.g., the Global 500 brands from BrandFinance.com. The URL: http://brandirectory.com/league_tables/table/global-500-2018
# View the page source and inspect the page to identify the data to collect.
library(rvest)

# Download and parse the league table; each <tr> inside <tbody> is one brand.
webpage <- read_html("http://brandirectory.com/league_tables/table/global-500-2018")
xdata <- webpage %>% html_nodes("tbody tr")

# Return the first element of `x`, or NA when the selector matched nothing.
# This guarantees exactly one value per field, so rows can never become
# shorter/longer than nine entries and shift columns when they are stacked.
first_or_na <- function(x) {
  if (length(x) > 0) x[[1]] else NA
}

# Extract the nine fields of a single table row (a one-element node set).
# The result is a character vector because c() coerces the mixed values.
parse_row <- function(row) {
  tmp <- row %>% html_nodes("td")
  c(
    first_or_na(tmp[1] %>% html_nodes("span") %>% html_text()),
    first_or_na(tmp[2] %>% html_text() %>% as.numeric()),
    first_or_na(tmp[4] %>% html_text()),
    # html_attr() is rvest's own attribute accessor; xml_attr() belongs to
    # xml2, which library(rvest) does not necessarily attach.
    first_or_na(tmp[3] %>% html_nodes("img") %>% html_attr("src")),
    first_or_na(tmp[5] %>% html_nodes("img") %>% html_attr("src")),
    first_or_na(tmp[6] %>% html_nodes(".h") %>% html_text()),
    first_or_na(tmp[7] %>% html_nodes(".h") %>% html_text()),
    first_or_na(tmp[8] %>% html_nodes("span") %>% html_text()),
    first_or_na(tmp[9] %>% html_nodes("span") %>% html_text())
  )
}

# xname <- webpage %>% html_nodes(".col-sm-9 .main th") %>% html_text()
col_names <- c("Rank17", "Rank16", "Company", "Logo", "Flag",
               "Value17", "Value16", "Rate17", "Rate16")

# Parse all rows first and bind once, instead of growing DataX with rbind()
# inside a loop; seq_along() also copes with a page that has no rows.
rows <- lapply(seq_along(xdata), function(i) parse_row(xdata[i]))
if (length(rows) > 0) {
  DataX <- do.call(rbind, rows)
} else {
  DataX <- matrix(character(0), nrow = 0, ncol = length(col_names))
}
colnames(DataX) <- col_names
DataX <- as.data.frame(DataX)
knitr::kable(head(DataX), format = "html")

# Persist the scraped table, then reload it so that read.csv() infers the
# column types. stringsAsFactors = TRUE keeps text columns as factors on
# R >= 4.0 as well, which is what the summary below tabulates.
write.csv(DataX, file = "TopBrand2018.csv", row.names = FALSE)
DataX <- read.csv("TopBrand2018.csv", stringsAsFactors = TRUE)
summary(DataX) # Before preprocessing ... to detect the data problems ...
## Rank17 Rank16 Company
## Min. : 1.0 Min. : 1.0 CVS Health : 1
## 1st Qu.:125.8 1st Qu.:115.5 Sumitomo Mitsui Financial Group: 1
## Median :250.5 Median :231.0 20th Century Fox : 1
## Mean :250.5 Mean :234.0 3 Mobile : 1
## 3rd Qu.:375.2 3rd Qu.:346.5 3M : 1
## Max. :500.0 Max. :500.0 7-Eleven : 1
## NA's :41 (Other) :494
## Logo
## /images/profile/logo/2000px_macys_logo_cms.jpg : 1
## /images/profile/logo/2000px_morgan_stanley_logo_1_cms.jpg: 1
## /images/profile/logo/2000px_youtube_logo_2017_cms.jpg : 1
## /images/profile/logo/20th_century_fox_logo.jpg : 1
## /images/profile/logo/3_mobile_3.png : 1
## /images/profile/logo/3m.jpg : 1
## (Other) :494
## Flag Value17 Value16 Rate17
## /images/flags/us.png:193 Min. : 14635 Min. : 0 Min. :17.00
## /images/flags/cn.png: 60 1st Qu.: 18537 1st Qu.: 16242 1st Qu.:21.00
## /images/flags/jp.png: 36 Median : 22246 Median : 21944 Median :22.00
## /images/flags/fr.png: 35 Mean : 32536 Mean : 28122 Mean :21.92
## /images/flags/gb.png: 29 3rd Qu.: 37502 3rd Qu.: 32032 3rd Qu.:23.00
## /images/flags/de.png: 24 Max. :150811 Max. :109470 Max. :24.00
## (Other) :123 NA's :400 NA's :400 NA's :400
## Rate16
## Min. : 0.00
## 1st Qu.:21.00
## Median :22.00
## Mean :21.46
## 3rd Qu.:23.00
## Max. :24.00
## NA's :400
# Text columns: turn (possibly factor) columns into plain character vectors.
for (nm in c("Company", "Logo", "Flag")) {
  DataX[[nm]] <- as.character(DataX[[nm]])
}
# Numeric columns: as.numeric() on a factor returns the internal level codes,
# not the displayed numbers, so a factor is converted through its labels
# first. Columns that are already numeric pass through unchanged.
for (nm in c("Rank17", "Rank16", "Value17", "Value16", "Rate17", "Rate16")) {
  column <- DataX[[nm]]
  if (is.factor(column)) {
    column <- as.character(column)
  }
  DataX[[nm]] <- as.numeric(column)
}
summary(DataX) # After preprocessing ... thus develop the data sense ...
## Rank17 Rank16 Company Logo Flag
## Min. : 1.0 Min. : 1.0 Length:500 Length:500 Length:500
## 1st Qu.:125.8 1st Qu.:115.5 Class :character Class :character Class :character
## Median :250.5 Median :231.0 Mode :character Mode :character Mode :character
## Mean :250.5 Mean :234.0
## 3rd Qu.:375.2 3rd Qu.:346.5
## Max. :500.0 Max. :500.0
## NA's :41
## Value17 Value16 Rate17 Rate16
## Min. : 14635 Min. : 0 Min. :17.00 Min. : 0.00
## 1st Qu.: 18537 1st Qu.: 16242 1st Qu.:21.00 1st Qu.:21.00
## Median : 22246 Median : 21944 Median :22.00 Median :22.00
## Mean : 32536 Mean : 28122 Mean :21.92 Mean :21.46
## 3rd Qu.: 37502 3rd Qu.: 32032 3rd Qu.:23.00 3rd Qu.:23.00
## Max. :150811 Max. :109470 Max. :24.00 Max. :24.00
## NA's :400 NA's :400 NA's :400 NA's :400
# Derive the country code from the flag image path, e.g.
# "/images/flags/us.png" -> "us". fixed = TRUE makes both patterns literal;
# as a regular expression the "." in ".png" would match any character.
DataX$Country <- gsub("/images/flags/", "", DataX$Flag, fixed = TRUE)
DataX$Country <- gsub(".png", "", DataX$Country, fixed = TRUE)
DataX$Country <- as.factor(DataX$Country)
# summary() of a factor gives the count per level. head() keeps at most ten
# entries, whereas [1:10] would pad with NA bars when fewer countries exist.
barplot(head(sort(summary(DataX$Country), decreasing = TRUE), 10),
        col = 5, main = "Top 10 Countries with Top Brands")
library(curl)
# Download site-relative image paths (e.g. "/images/flags/us.png") from
# brandirectory.com into the same relative location under the working
# directory.
#   paths  character vector (or factor) of site-relative paths.
# Missing/empty paths are skipped and duplicates are fetched only once; the
# target folder of each file is created on demand.
download_images <- function(paths) {
  paths <- as.character(paths)
  paths <- unique(paths[!is.na(paths) & nzchar(paths)])
  for (path in paths) {
    destfile <- paste0("./", path)
    dir.create(dirname(destfile), recursive = TRUE, showWarnings = FALSE)
    curl_download(url = paste0("http://brandirectory.com", path),
                  destfile = destfile)
  }
  invisible(paths)
}

# Brand logos (one per company).
download_images(DataX$Logo)

# Country flags (shared by many companies, so only the distinct ones).
flaglist <- unique(DataX$Flag)
download_images(flaglist)
# Step 1: Start with data loading and preprocessing. For simplicity, we omit missing values.
# stringsAsFactors = TRUE: the code further down calls summary()/levels() on
# Sector and expects factors; since R 4.0 read.csv() returns character
# columns unless this is requested explicitly.
DataX <- read.csv("BrandFinance.csv", stringsAsFactors = TRUE)
DataX <- DataX[, c("Year", "Rank", "RankLastyear", "Company", "Value", "Rate", "Country", "Sector")]
# This script treats -1 in Value and Rate as "missing".
DataX$Value[DataX$Value == -1] <- NA
DataX$Rate[DataX$Rate == -1] <- NA
# Convert via as.character() so a factor is translated by its labels rather
# than its level codes; entries that are not numbers become NA.
DataX$RankLastyear <- as.numeric(as.character(DataX$RankLastyear))
# Drop every row that has a missing value in any of the kept columns.
DataX <- na.omit(DataX)
summary(DataX)
## Year Rank RankLastyear Company Value
## Min. :2009 Min. : 1.0 Min. : 1.00 Allianz : 9 Min. : 3955
## 1st Qu.:2011 1st Qu.: 25.0 1st Qu.: 25.00 Amazon.com : 9 1st Qu.: 12475
## Median :2013 Median : 49.5 Median : 50.00 American Express: 9 Median : 16607
## Mean :2013 Mean : 49.9 Mean : 55.51 Apple : 9 Mean : 20498
## 3rd Qu.:2015 3rd Qu.: 75.0 3rd Qu.: 77.75 AT&T : 9 3rd Qu.: 23007
## Max. :2017 Max. :100.0 Max. :391.00 Bank of America : 9 Max. :145918
## (Other) :824
## Rate Country Sector
## Min. : 0.00 us :403 Banks :159
## 1st Qu.:20.00 jp : 90 Technology :150
## Median :21.00 cn : 78 Telecommunications:105
## Mean :20.98 de : 73 Retail : 90
## 3rd Qu.:22.00 gb : 54 Auto Manufacturers: 70
## Max. :24.00 fr : 51 Oil&Gas : 54
## (Other):129 (Other) :250
# Count the rows per sector and keep the (at most) five most frequent ones.
# as.factor() keeps this working when Sector was read as character, and
# head() avoids the NA entries that [1:5] yields for fewer than five sectors.
tmp <- head(sort(summary(as.factor(DataX$Sector)), decreasing = TRUE), 5)
barplot(tmp, col = 5, space = 0, xaxt = "n", yaxt = "n",
        main = "Top 5 sectors with Top 100 Brands")
# Both axes are suppressed above; write each sector name vertically (srt = 90)
# inside its bar instead. xpd = TRUE lets long names run past the plot region.
text(x = seq_along(tmp) - 0.6, y = 2, labels = names(tmp),
     cex = 1.2, pos = 4, srt = 90, xpd = TRUE)
TopSector <- names(tmp)
# Step 2: Think creatively how the top brand ranking/values can be visualized …
# Rename every Sector level that is not listed in TopSector to "Others"
# (identical level names are merged), then fix the level order to
# TopSector followed by "Others".
levels(DataX$Sector)[!(levels(DataX$Sector) %in% TopSector)] <- "Others"
DataX$Sector <- factor(DataX$Sector, levels = c(TopSector, "Others"))

# Shared axis limits used by the plots below.
xlim0 <- c(-10, max(DataX$RankLastyear))
ylim0 <- c(-5, max(DataX$Rank))

# One semi-transparent colour per sector level (palette entries 2, 3, ...).
Colmap <- adjustcolor(seq_len(nlevels(DataX$Sector)) + 1, alpha.f = 0.6)
# Draw a bubble chart of the brand rankings of one year.
#
# Arguments:
#   DataX  data frame with the columns Year, Rank, RankLastyear, Value and
#          Sector. Sector is used to index Colmap, so it is expected to be a
#          factor whose levels line up with Colmap.
#   Year   the year whose rows are plotted.
#
# Each row of that year becomes one point: x = RankLastyear, y = Rank (axis
# reversed), point size derived from Value, colour from Sector.
# Reads the global variables xlim0, ylim0 and Colmap.
# Returns (invisibly) the value of legend().
BubblePlot <- function(DataX, Year) {
  TmpX <- DataX[DataX$Year == Year, ]

  # Scale sizes into [1, 10] using the value range of the WHOLE data set, so
  # bubble sizes stay comparable between the plots of different years.
  value_range <- diff(range(DataX$Value))
  if (is.finite(value_range) && value_range > 0) {
    Size <- 1 + 9 * (TmpX$Value - min(DataX$Value)) / value_range
  } else {
    # All values identical (or no usable range): avoid dividing by zero.
    Size <- rep(1, nrow(TmpX))
  }

  # Change the margins only for this plot and restore the caller's settings.
  old_par <- par(mar = c(4, 4, 3, 3))
  on.exit(par(old_par), add = TRUE)

  plot(TmpX$RankLastyear, TmpX$Rank,
       xlim = xlim0, ylim = rev(ylim0),
       pch = 20, col = Colmap[TmpX$Sector], cex = Size,
       xlab = paste(Year - 1, "Ranking"), ylab = paste(Year, "Ranking"),
       main = paste("Year", Year))
  invisible(legend("topright", levels(DataX$Sector), pch = 20, col = Colmap))
}
BubblePlot(DataX, 2017)
# Step 3: Generate the animation with year frames
library(magick)
ListYear <- sort(unique(DataX$Year))
# Open a 500 x 500 magick graphics device; the plots drawn while it is open
# are collected in Img.
Img <- image_graph(500, 500, res = 72)
# seq_along() draws nothing when ListYear is empty, whereas 1:length() would
# iterate over c(1, 0).
for (k in seq_along(ListYear)) {
  BubblePlot(DataX, ListYear[k])
}
dev.off()
# Trim the frames and write them as a GIF played at one frame per second.
Img %>% image_trim() %>% image_animate(fps = 1) %>% image_write("TopBrands.gif")
# Step 4: You are right, we are talking about Plotly …
library(plotly)

# Interactive version of the bubble chart for a single year.
Year <- 2017
TmpX <- DataX[DataX$Year == Year, ]
TmpX$Country <- factor(TmpX$Country)
# Marker size in [1, 20], scaled against the value range of the whole data set.
TmpX$Size <- 1 + 19 * (TmpX$Value - min(DataX$Value)) / diff(range(DataX$Value))

plot_ly(
  TmpX,
  x = ~RankLastyear,
  y = ~Rank,
  type = "scatter",
  mode = "markers",
  size = ~Size,
  color = ~Sector,
  # Show only the company name when hovering over a marker.
  hoverinfo = "text",
  text = ~paste("", Company)
) %>%
  layout(
    xaxis = list(
      range = xlim0,
      zeroline = FALSE,
      title = paste(Year - 1, "Ranking")
    ),
    yaxis = list(
      range = ylim0,
      autorange = "reversed",
      zeroline = FALSE,
      title = paste(Year, "Ranking")
    )
  )
library(ggplot2)
# Same chart built with ggplot2 and converted by ggplotly().
# NOTE(review): `text` and `frame` are not ggplot2 aesthetics; they appear to
# be supplied for ggplotly() (hover text / per-year frames) - ggplot2 itself
# may warn about them.
gg <- ggplot(DataX, aes(RankLastyear, Rank, color = Sector, text = Company)) +
  geom_point(aes(size = Value, frame = Year)) +
  # Spell out `limits`; `lim` only worked through partial argument matching.
  scale_y_reverse(limits = c(100, -5))
ggplotly(gg)